In [275]:
import pickle;
from imp import reload
import numpy as np; import pandas as pd
import lightgbm as lgb; import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import constants, utils, inference, evaluation
from joblib import Parallel, delayed
import multiprocessing
import os
# Disable pandas' chained-assignment check (SettingWithCopyWarning) notebook-wide;
# several cells below add columns to frames obtained by slicing `gid`.
pd.options.mode.chained_assignment = None
In [188]:
# Evaluation frame; every later cell reads from it (directly or via the
# slices taken below).
data = pd.read_hdf('/data/Instacart/eval.h5')

# Target column and the identifier columns listed in constants.ID_COLS.
label = data['label']
gid = data[constants.ID_COLS]

# One row per distinct order_id; used when building the submission file.
orders = data[['order_id']].drop_duplicates()
In [3]:
def generate_pred(m, idx, is_sub=False):
    '''
    Score the evaluation frame with one pickled model and optionally evaluate it.

    Reads the notebook-level globals ``data``, ``gid`` and ``label``.

    m: model path (a pickle file containing the trained booster)
    idx: index of model (only used in the progress message)
    is_sub: bool indicator for submission; for any value other than ``False``
        only the raw predictions are returned

    Returns:
        ``pred`` when in submission mode, otherwise ``(eval_res, pred)`` where
        ``eval_res`` is a dict with keys 'model_file', 'eval_auc', 'eval_mf1'.
    '''
    print('Evaluating Model {} ...'.format(idx))
    print('Model Path {}'.format(m))
    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files that come from a trusted location.
    with open(m, 'rb') as model_fh:  # context manager closes the handle
        bst = pickle.load(model_fh)
    feat = data[utils.get_feat_col(bst)]  # data global variable
    pred = utils.get_predition(bst, feat)

    # Same test as the original `is_sub is False` branch, inverted into a
    # guard so submission mode skips building the evaluation frame.
    if is_sub is not False:
        return pred

    # Explicit copy: the columns added below belong to this frame only.
    user_product = gid[['user_id', 'product_id', 'order_id']].copy()  # gid global variable
    user_product['score'] = pred
    user_product['label'] = label  # label global variable
    auc = roc_auc_score(label, pred)
    print('Evaluation AUC {}'.format(auc))

    # Per-order product selection; 'products' comes back as a
    # whitespace-separated string that may contain the token 'None'.
    op = utils.tarbox_f1_optim(user_product.copy(), low_bound=0)
    op['products'] = op['products'].apply(
        lambda x: [int(i) if i != 'None' else i for i in x.split()]
    )
    # Left-merge onto the full list of order ids so every order keeps a row.
    op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
                  op, on=['order_id'], how='left')

    gold = evaluation.get_gold(user_product)
    res = evaluation.evaluation(gold, op[['order_id', 'products']])
    mf1 = res.f1score.mean()
    print('F1 Optimization Result: mean-f1-score {}'.format(mf1))

    eval_res = {'model_file': m.split('/')[-1], 'eval_auc': auc, 'eval_mf1': mf1}
    return eval_res, pred
In [327]:
%%time
# Collect the per-model evaluation summaries and raw predictions for every
# model path listed in constants.MODEL_PATH_8.
eval_infos_8, eval_preds_8 = [], []
for i, m in enumerate(constants.MODEL_PATH_8):
    # Evaluation mode: generate_pred returns (summary dict, predictions).
    # Submission-only alternative kept for reference:
    # preds = generate_pred(m, i, True)
    infos, preds = generate_pred(m, i, is_sub=False)
    eval_infos_8.append(infos)
    eval_preds_8.append(preds)
    # Hard-coded summary kept for reference from an earlier run:
    # eval_infos_5.append({'model_file':m.split('/')[-1], 'eval_auc': 0.840101, 'eval_mf1': 0.404459})
In [354]:
# Previously exported evaluation scores; the `score` column is used by the
# next cell.
big_lgb_dart = pd.read_csv(
    filepath_or_buffer='./submission/eval_big_lgb_dart_0.8386003614599506.csv'
)
In [355]:
# Id columns plus ground truth and the scores loaded from the CSV.
user_product = gid[['user_id', 'product_id', 'order_id']].assign(
    label=label,
    score=big_lgb_dart.score.values,  # .values: assign by position, not by index
)
gold = evaluation.get_gold(user_product)

# Per-order product selection on a copy, so user_product keeps its raw rows.
op = utils.tarbox_f1_optim(user_product.copy(), low_bound=0)
# 'products' is a whitespace-separated string; keep the literal 'None' token
# and convert every other token to int.
op['products'] = op['products'].apply(
    lambda s: [tok if tok == 'None' else int(tok) for tok in s.split()]
)
# Left-merge onto all order ids so every order keeps a row.
order_ids = pd.DataFrame({'order_id': user_product.order_id.unique()})
op = order_ids.merge(op, on=['order_id'], how='left')
res = evaluation.evaluation(gold, op[['order_id', 'products']])
In [356]:
# Report the mean of the per-row f1score values in `res`.
f1_report = 'F1 Optimization Result: mean-f1-score {}'
print(f1_report.format(res.f1score.mean()))
In [328]:
# Turn the list of per-model summary dicts into a frame (the name is reused,
# so from here on eval_infos_8 is a DataFrame, not a list).
eval_infos_8 = pd.DataFrame(data=eval_infos_8)
In [329]:
# Stack the evaluation summaries of all model batches row-wise.
# NOTE(review): eval_infos and eval_infos_2..7 are not defined in the visible
# part of this notebook; they must already exist in the kernel.
eval_info_frames = [
    eval_infos_2,
    eval_infos_4,
    eval_infos.iloc[[22, 16, 15]],  # three hand-picked rows, by position
    eval_infos_3,
    eval_infos_5,
    eval_infos_6,
    eval_infos_7,
    eval_infos_8,
]
bagging_tree = pd.concat(eval_info_frames, axis=0)
In [330]:
def patch(x):
    '''
    Turn a bare model file name into its full path under the shared model dir.

    x: model file name; names starting with 'xgb' go under the xgb/
       sub-directory, everything else under lgb/.

    Returns the full path as a string. A value that already starts with the
    shared model directory is returned unchanged, so re-running the cell that
    applies this function does not prefix the path a second time.
    '''
    model_root = '/home/public/Instacart/'
    if x.startswith(model_root):
        # Already patched: keep as is (idempotent on re-run).
        return x
    if x.startswith('xgb'):
        return model_root + 'xgb/' + x
    return model_root + 'lgb/' + x
# Replace every model file name with the path returned by patch().
patched_paths = bagging_tree['model_file'].apply(patch)
bagging_tree['model_file'] = patched_paths
In [331]:
# Persist the combined summary table (unsorted at this point), overwriting
# any existing file. `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
bagging_tree.to_hdf(constants.EVA_DATA_DIR + 'bagging_tree.h5', key='bagging', mode='w')
In [332]:
# Order models by eval_mf1 (ascending by default). The prediction list built
# below follows this row order and is then sliced by position.
bagging_tree = bagging_tree.sort_values(by='eval_mf1')
In [333]:
# Display only: the re-indexed frame is shown as the cell output and is not
# assigned back, so bagging_tree itself keeps its original index.
bagging_tree.reset_index()
Out[333]:
In [334]:
# Load (or compute and cache) the evaluation-set predictions of every model,
# in bagging_tree row order.
pred_evals = []
for idx, m in enumerate(bagging_tree.model_file.values):
    # NOTE(review): the suffix is 'pkl' with no leading dot. It is kept as is
    # so cache files written by earlier runs are still found.
    fp = constants.EVA_DATA_DIR + m.split('/')[-1] + 'pkl'
    if os.path.exists(fp):
        # NOTE(review): unpickling can execute arbitrary code; the cache
        # directory must be trusted.
        with open(fp, 'rb') as f:  # close the read handle deterministically
            pred = pickle.load(f)
    else:
        pred = generate_pred(m, idx, is_sub=True)
        # Write to the same path that was just checked.
        with open(fp, 'wb') as f:
            pickle.dump(pred, f, pickle.HIGHEST_PROTOCOL)
    pred_evals.append(pred)
In [352]:
# Hierarchical median blend over fixed positional groups of pred_evals.
# The trailing numbers are the author's recorded results (presumably mean F1 — confirm).
level0 = np.median(pred_evals[:5], axis=0)  # 0.4034
level1 = np.median(pred_evals[5:9], axis=0)

# Level 2: the two level-0/1 blends together with models 9..11.
level2_members = [level0, level1]
level2_members.extend(pred_evals[9:12])
level2 = np.median(level2_members, axis=0)  # 0.40429

# Level 3: the level-2 blend together with models 12..13.
level3_members = [level2]
level3_members.extend(pred_evals[12:14])
level3 = np.median(level3_members, axis=0)
In [353]:
%%time
# Id columns plus ground truth and the blended level3 scores.
user_product = gid[['user_id', 'product_id', 'order_id']].assign(
    label=label,
    score=level3,
)
gold = evaluation.get_gold(user_product)

# Per-order product selection on a copy, so user_product keeps its raw rows.
op = utils.tarbox_f1_optim(user_product.copy(), low_bound=0)
# 'products' is a whitespace-separated string; keep the literal 'None' token
# and convert every other token to int.
op['products'] = op['products'].apply(
    lambda s: [tok if tok == 'None' else int(tok) for tok in s.split()]
)
# Left-merge onto all order ids so every order keeps a row.
order_ids = pd.DataFrame({'order_id': user_product.order_id.unique()})
op = order_ids.merge(op, on=['order_id'], how='left')
res = evaluation.evaluation(gold, op[['order_id', 'products']])
In [350]:
# Report the mean of the per-row f1score values in `res`.
f1_report = 'F1 Optimization Result: mean-f1-score {}'
print(f1_report.format(res.f1score.mean()))
In [ ]:
# Build the submission file from the scores currently held in user_product
# (set by whichever scoring cell ran last).
op = utils.shing_f1_optim(user_product.copy(), low_bound=0.01, topk=200)

# Right-merge onto the full order list so every order_id gets a row; orders
# missing from `op` come back with a null `products` value.
op = pd.merge(op[['order_id', 'products']], orders[['order_id']], on=['order_id'], how='right')
op.columns = ['order_id', 'products']

# Fill the missing rows with the literal 'None' in a single vectorised
# assignment instead of looping over rows with .at.
op.loc[op['products'].isnull(), 'products'] = 'None'

op[['order_id', 'products']].to_csv('./submission/lgb3_big_bag_shing.csv', index=False)
In [ ]: